# Report the current working directory (auto-printed when run at top level).
# The relative input and output file names used further down resolve against it.
getwd()
# Remove every object returned by ls() so the script starts from a clean slate.
# NOTE(review): when this script is source()d into a live session this also
# wipes the caller's workspace -- consider dropping it and running the script
# in a fresh R process instead.
rm(list=ls())

library(seqinr) #R package
# seqinr is used because its translate() takes the reading frame as an argument, so each frame is a single call
# Input: the junction table, saved from Excel as a tab-delimited .txt file.
# The first line of the file is the header row; text columns are kept as plain
# character vectors (not factors) so identifiers and sequences can be pasted
# as-is. TRUE/FALSE are spelled out because T and F are ordinary variables
# that can be reassigned.
merged_exons <- read.table(
  "LSV-junction-database.txt",
  stringsAsFactors = FALSE,
  sep = "\t",
  header = TRUE
)

# Columns 1-5 are accessed by position later in this script, so fail here with
# a clear message instead of an "undefined columns" error further down.
if (ncol(merged_exons) < 5L) {
  stop(
    "LSV-junction-database.txt must have at least 5 tab-separated columns, ",
    "found ", ncol(merged_exons),
    call. = FALSE
  )
}

# all_aa_frames(SEQ): translate one nucleotide sequence in each of the three
# forward reading frames.
#
# SEQ     the nucleotide sequence as a single string (any case).
# Returns a list with elements named "0", "1" and "2" (the frame offset); each
#         element is the translated peptide for that frame, collapsed into one
#         string.
#
# Only the three forward frames are produced because the directionality of the
# DNA sequence is already known. The sequence is lower-cased and split into
# single characters first, which is the form passed to seqinr's translate().
all_aa_frames <- function(SEQ) {
  nucleotides <- strsplit(tolower(SEQ), split = "")
  nucleotides <- unlist(nucleotides)

  frame_offsets <- 0:2
  peptides <- lapply(frame_offsets, function(offset) {
    # sens = "F": forward strand; numcode = 1: genetic code table 1;
    # NAstring = "X": placeholder passed to translate() for residues it
    # cannot assign.
    residues <- translate(
      seq = nucleotides,
      frame = offset,
      sens = "F",
      numcode = 1,
      NAstring = "X",
      ambiguous = FALSE
    )
    paste0(residues, collapse = "")
  })

  names(peptides) <- as.character(frame_offsets)
  peptides
}

# Build the FASTA-style output: for every row of merged_exons and every
# reading frame, one header line followed by one peptide line.
#
# The header imitates a UniProt FASTA header, e.g.
# >sp|P04406|G3P_HUMAN Glyceraldehyde-3-phosphate dehydrogenase OS=Homo sapiens OX=9606 GN=GAPDH PE=1 SV=3
# and is assembled here as
# >LA|<column 1>_<column 2><frame>|<column 5> OS=Homo sapiens GN=<column 4>
# Column 3 holds the nucleotide sequence that gets translated.
frame_ids <- c("0", "1", "2")
n_rows <- nrow(merged_exons)

# Two lines (header + peptide) per frame per row. Preallocating avoids
# re-copying the whole vector on every append.
output_data <- character(2L * length(frame_ids) * n_rows)
next_slot <- 1L

# seq_len() yields an empty sequence for an empty table, whereas 1:0 would
# iterate over c(1, 0) and fail on a non-existent row.
for (row_i in seq_len(n_rows)) {
  row_data <- merged_exons[row_i, ]
  included_seq <- row_data[[3]]
  included_frames <- all_aa_frames(included_seq)
  for (frame in frame_ids) {
    fasta_name_inc <- paste0(
      ">LA|", row_data[[1]], "_", row_data[[2]], frame, "|",
      row_data[[5]], " OS=Homo sapiens GN=", row_data[[4]]
    )
    aa_frame <- as.character(included_frames[frame])
    output_data[next_slot] <- fasta_name_inc
    output_data[next_slot + 1L] <- aa_frame
    next_slot <- next_slot + 2L
  }
}

# Cut every peptide at its first "*": remove that "*" and everything after it.
# Only sequence lines are edited; header lines (which start with ">") are
# skipped so a "*" inside an identifier or description cannot truncate a
# header.
please_stop <- as.character(output_data)
peptide_rows <- which(!startsWith(please_stop, ">"))
please_stop[peptide_rows] <- sub(
  pattern = "\\*.*",
  replacement = "",
  x = please_stop[peptide_rows]
)

# Write the output file, one element per line (header, then its peptide).
# writeLines() replaces write(..., sep = ","): with write()'s default of one
# column for character data the "," separator was never emitted, so the file
# contents are the same and the call no longer suggests a comma-separated file.
writeLines(please_stop, "peptide_LSV_junction-usage.txt")




